# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# Read the table
df_msf_union = pd.read_csv('Data/df_union.csv.gz', compression='gzip')
df_msf_union.head(3)
| | rec_id | msf_annualizedquota__c | msf_cancelationdate__c | msf_currentleadsource1__c | msf_currentquotamodification__c | msf_leadsource1__c | msf_memberid__c | npe03__contact__c | npe03__date_established__c | npe03__installment_period__c | npe03__recurring_donation_campaign__c | msf_birthyear__c | Gender__c | msf_LTVCont__c | msf_PressureComplaint__c | msf_ScoringRFVRecurringDonor__c | msf_AverageDonorValue__c | con_id | msf_EntryCampaign__c | msf_FirstCampaignEntryRecurringDonor__c | msf_FirstCampaingColaboration__c | quo_id | isdeleted | msf_recurringdonation__c | msf_campaigninfluence__c | msf_changeannualizedquota__c | msf_changetype__c | msf_leadsource3__c | msf_newannualizedquota__c | msf_newrecurringperiod__c | msf_contactid__c | msf_changedate__c | flag_aumento | year_activo |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | a093Y00001RZ7YyQAL | 180.0 | 2018-06-01 | Email a SAS | a1y3Y000002PwXAQA0 | Email a SAS | 10916097 | 0033Y00002uo1H5QAI | 2015-12-03 | Monthly | 7013Y000001mrBLQAY | NaN | Female | 830.0 | False | 0.6 | NaN | 0033Y00002uo1H5QAI | 7013Y000001mr30QAA | 7013Y000001mr30QAA | 7013Y000001mr30QAA | a1y3Y000002PwXAQA0 | False | a093Y00001RZ7YyQAL | 7013Y000001mrBLQAY | 180.0 | Initial | Online | 180.0 | Monthly | 0033Y00002uo1H5QAI | 2015-12-03 | 0 | 2015 |
| 1 | a093Y00001RZ7ZuQAL | 60.0 | 2019-05-07 | Persona a persona | a1y3Y000002RPGyQAO | Persona a persona | 11132307 | 0033Y00002uooTFQAY | 2015-02-19 | Yearly | 7013Y000001mr2cQAA | 1957.0 | Male | 300.0 | False | 1.8 | NaN | 0033Y00002uooTFQAY | 7013Y000001mr2cQAA | 7013Y000001mr2cQAA | 7013Y000001mr2cQAA | a1y3Y000002RPGyQAO | False | a093Y00001RZ7ZuQAL | 7013Y000001mr2cQAA | 60.0 | Initial | Personal | 60.0 | Yearly | 0033Y00002uooTFQAY | 2015-02-19 | 0 | 2015 |
| 2 | a093Y00001RZ7a2QAD | 300.0 | 2019-01-07 | Persona a persona | a1y3Y000002RCV1QAO | Persona a persona | 11156602 | 0033Y00002up0LKQAY | 2015-04-07 | Monthly | 7013Y000001mr2DQAQ | 1988.0 | Male | 1125.0 | False | 1.5 | NaN | 0033Y00002up0LKQAY | 7013Y000001mr2DQAQ | 7013Y000001mr2DQAQ | 7013Y000001mr2DQAQ | a1y3Y000002RCV1QAO | False | a093Y00001RZ7a2QAD | 7013Y000001mr2DQAQ | 300.0 | Initial | Personal | 300.0 | Monthly | 0033Y00002up0LKQAY | 2015-04-07 | 0 | 2015 |
# Dataset size
df_msf_union.shape
(597853, 34)
# Show the columns and their dtypes
print("Columns and their dtypes:")
print(df_msf_union.dtypes)
Columns and their dtypes:
rec_id                                      object
msf_annualizedquota__c                     float64
msf_cancelationdate__c                      object
msf_currentleadsource1__c                   object
msf_currentquotamodification__c             object
msf_leadsource1__c                          object
msf_memberid__c                              int64
npe03__contact__c                           object
npe03__date_established__c                  object
npe03__installment_period__c                object
npe03__recurring_donation_campaign__c       object
msf_birthyear__c                           float64
Gender__c                                   object
msf_LTVCont__c                             float64
msf_PressureComplaint__c                      bool
msf_ScoringRFVRecurringDonor__c            float64
msf_AverageDonorValue__c                   float64
con_id                                      object
msf_EntryCampaign__c                        object
msf_FirstCampaignEntryRecurringDonor__c     object
msf_FirstCampaingColaboration__c            object
quo_id                                      object
isdeleted                                     bool
msf_recurringdonation__c                    object
msf_campaigninfluence__c                    object
msf_changeannualizedquota__c               float64
msf_changetype__c                           object
msf_leadsource3__c                          object
msf_newannualizedquota__c                  float64
msf_newrecurringperiod__c                   object
msf_contactid__c                            object
msf_changedate__c                           object
flag_aumento                                 int64
year_activo                                  int64
dtype: object
# Some columns hold date values but are not datetime-typed; convert them.
def cast_as_datetime(dataframe):
    cols = dataframe.columns
    datecols = [col for col in cols if "date" in col]
    for col in datecols:
        # The preview shows ISO-style dates (e.g. 2018-06-01), so the format is '%Y-%m-%d'
        dataframe[col] = pd.to_datetime(dataframe[col], format='%Y-%m-%d')
    return dataframe
# Call the function we just defined:
df_msf_union = cast_as_datetime(df_msf_union)
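If some rows deviate from the expected pattern, a fixed format raises a ValueError. A more tolerant sketch (a hypothetical helper, not part of the original pipeline) coerces unparseable values to NaT:
# Hypothetical variant: unparseable date strings become NaT instead of raising
def cast_as_datetime_safe(dataframe):
    for col in [c for c in dataframe.columns if "date" in c]:
        dataframe[col] = pd.to_datetime(dataframe[col], errors='coerce')
    return dataframe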
# Convert birth year into age
currentYear = 2023
df_msf_union["age"] = currentYear - df_msf_union["msf_birthyear__c"]
df_msf_union.head(2)
| | rec_id | msf_annualizedquota__c | msf_cancelationdate__c | msf_currentleadsource1__c | msf_currentquotamodification__c | msf_leadsource1__c | msf_memberid__c | npe03__contact__c | npe03__date_established__c | npe03__installment_period__c | npe03__recurring_donation_campaign__c | msf_birthyear__c | Gender__c | msf_LTVCont__c | msf_PressureComplaint__c | msf_ScoringRFVRecurringDonor__c | msf_AverageDonorValue__c | con_id | msf_EntryCampaign__c | msf_FirstCampaignEntryRecurringDonor__c | msf_FirstCampaingColaboration__c | quo_id | isdeleted | msf_recurringdonation__c | msf_campaigninfluence__c | msf_changeannualizedquota__c | msf_changetype__c | msf_leadsource3__c | msf_newannualizedquota__c | msf_newrecurringperiod__c | msf_contactid__c | msf_changedate__c | flag_aumento | year_activo | age |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | a093Y00001RZ7YyQAL | 180.0 | 2018-06-01 | Email a SAS | a1y3Y000002PwXAQA0 | Email a SAS | 10916097 | 0033Y00002uo1H5QAI | 2015-12-03 | Monthly | 7013Y000001mrBLQAY | NaN | Female | 830.0 | False | 0.6 | NaN | 0033Y00002uo1H5QAI | 7013Y000001mr30QAA | 7013Y000001mr30QAA | 7013Y000001mr30QAA | a1y3Y000002PwXAQA0 | False | a093Y00001RZ7YyQAL | 7013Y000001mrBLQAY | 180.0 | Initial | Online | 180.0 | Monthly | 0033Y00002uo1H5QAI | 2015-12-03 | 0 | 2015 | NaN |
| 1 | a093Y00001RZ7ZuQAL | 60.0 | 2019-05-07 | Persona a persona | a1y3Y000002RPGyQAO | Persona a persona | 11132307 | 0033Y00002uooTFQAY | 2015-02-19 | Yearly | 7013Y000001mr2cQAA | 1957.0 | Male | 300.0 | False | 1.8 | NaN | 0033Y00002uooTFQAY | 7013Y000001mr2cQAA | 7013Y000001mr2cQAA | 7013Y000001mr2cQAA | a1y3Y000002RPGyQAO | False | a093Y00001RZ7ZuQAL | 7013Y000001mr2cQAA | 60.0 | Initial | Personal | 60.0 | Yearly | 0033Y00002uooTFQAY | 2015-02-19 | 0 | 2015 | 66.0 |
# Read the contacts table
data_contact = pd.read_parquet("Descarga tablas/MSF_Contact.parquet")
# LEFT JOIN on the shared column to attach the province and seniority fields
merged_df = pd.merge(df_msf_union, data_contact[['id', 'mailingstate','msf_seniority__c']], left_on='msf_contactid__c', right_on='id', how='left')
# Recode the mailingstate field according to our own criteria
# Define the conditions
condicion1 = merged_df['mailingstate'].str.lower().str.contains(r'bar.*ona|lle.*da|le.*da|tarr.*|gi.*ona|ge.*ona|catal.*|bad.*ona|ter.*ona|ter.*sa', case=False, na=False)
condicion2 = merged_df['mailingstate'].str.lower().str.contains('madrid|madird', case=False, na=False)
condicion3 = merged_df['mailingstate'].str.lower().str.contains(r'sev.*', case=False, na=False)
condicion4 = merged_df['mailingstate'].str.lower().str.contains(r'pa.*co|gu.*coa|g.*k|vi.*y|bi.*y|vi.*k|bi.*k|alava.*|alaba.*|san.* se|dono.*|vit.*', case=False, na=False)
condicion5 = merged_df['mailingstate'].str.lower().str.contains(r'va.*ia', case=False, na=False)
# Corresponding value for each condition
valor1 = 'Cataluña'
valor2 = 'Madrid'
valor3 = 'Sevilla'
valor4 = 'Pais Vasco'
valor5 = 'Valencia'
# Default value when no condition matches
valor_predeterminado = 'Resto'
# Assign values based on the conditions
merged_df['Provincia2'] = np.select([condicion1, condicion2, condicion3, condicion4,condicion5], [valor1, valor2, valor3, valor4,valor5], default=valor_predeterminado)
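A quick sanity check of the recoding, a sketch whose counts will depend on the data:
# Inspect how many records fall into each recoded province bucket
print(merged_df['Provincia2'].value_counts(dropna=False))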
# Replace blank strings ('') with NaN
merged_df.replace('', np.nan, inplace=True)
# Compute the percentage of null values (including blanks)
porcentaje_nulos = merged_df.isnull().mean() * 100
# Show the percentage of null values (including blanks)
print(porcentaje_nulos)
rec_id                                      0.000000
msf_annualizedquota__c                      0.000000
msf_cancelationdate__c                     61.883440
msf_currentleadsource1__c                   0.029271
msf_currentquotamodification__c             0.118758
msf_leadsource1__c                          0.010705
msf_memberid__c                             0.000000
npe03__contact__c                           0.000000
npe03__date_established__c                  0.000000
npe03__installment_period__c                0.215605
npe03__recurring_donation_campaign__c       0.000167
msf_birthyear__c                           18.685864
Gender__c                                   1.039720
msf_LTVCont__c                              0.819599
msf_PressureComplaint__c                    0.000000
msf_ScoringRFVRecurringDonor__c             0.000000
msf_AverageDonorValue__c                   74.653970
con_id                                      0.000000
msf_EntryCampaign__c                        0.007861
msf_FirstCampaignEntryRecurringDonor__c     0.007694
msf_FirstCampaingColaboration__c            0.476539
quo_id                                      0.000000
isdeleted                                   0.000000
msf_recurringdonation__c                    0.000000
msf_campaigninfluence__c                    0.034122
msf_changeannualizedquota__c                0.000000
msf_changetype__c                           0.000000
msf_leadsource3__c                          2.384533
msf_newannualizedquota__c                   0.000000
msf_newrecurringperiod__c                   0.016894
msf_contactid__c                            0.000000
msf_changedate__c                           0.030442
flag_aumento                                0.000000
year_activo                                 0.000000
age                                        18.685864
id                                          0.000000
mailingstate                                3.037536
msf_seniority__c                            0.000000
Provincia2                                  0.000000
dtype: float64
# Drop columns we do not consider necessary for the model (duplicates removed from the original list)
drop_columns = ['msf_campaigninfluence__c', 'msf_memberid__c', 'npe03__contact__c', 'msf_birthyear__c',
                'msf_FirstCampaignEntryRecurringDonor__c', 'mailingstate', 'isdeleted', 'msf_EntryCampaign__c',
                'msf_FirstCampaingColaboration__c', 'quo_id', 'msf_contactid__c', 'msf_changetype__c',
                'msf_changedate__c', 'msf_recurringdonation__c', 'npe03__recurring_donation_campaign__c',
                'id', 'msf_cancelationdate__c', 'npe03__date_established__c', 'msf_currentquotamodification__c']
merged_df_f = merged_df.drop(drop_columns, axis=1)
# Create a new field from the DataFrame's current index
merged_df_f["index"] = merged_df_f.index
# Get a statistical summary of the numeric columns
merged_df_f.describe()
| | msf_annualizedquota__c | msf_LTVCont__c | msf_ScoringRFVRecurringDonor__c | msf_AverageDonorValue__c | msf_changeannualizedquota__c | msf_newannualizedquota__c | flag_aumento | year_activo | age | msf_seniority__c | index |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 597853.000000 | 592953.000000 | 597853.000000 | 151532.000000 | 597853.000000 | 597853.000000 | 597853.000000 | 597853.000000 | 486139.000000 | 597853.000000 | 597853.00000 |
| mean | 166.898305 | 903.425773 | 3.142453 | 60.320070 | 104.074235 | 151.429510 | 0.105703 | 2017.319150 | 55.006910 | 6.957332 | 298926.00000 |
| std | 199.097233 | 1310.529636 | 1.602761 | 174.761642 | 224.515616 | 216.043149 | 0.307458 | 1.903696 | 16.224429 | 4.058340 | 172585.43958 |
| min | 0.000000 | 0.010000 | 0.000000 | 0.010000 | 0.000000 | 0.000000 | 0.000000 | 2015.000000 | 0.000000 | 0.000000 | 0.00000 |
| 25% | 84.000000 | 294.000000 | 1.600000 | 20.000000 | 48.000000 | 72.000000 | 0.000000 | 2016.000000 | 43.000000 | 5.000000 | 149463.00000 |
| 50% | 132.000000 | 627.000000 | 3.500000 | 30.440000 | 72.000000 | 120.000000 | 0.000000 | 2017.000000 | 55.000000 | 7.000000 | 298926.00000 |
| 75% | 204.000000 | 1135.000000 | 4.500000 | 60.000000 | 120.000000 | 180.000000 | 0.000000 | 2019.000000 | 67.000000 | 8.000000 | 448389.00000 |
| max | 72000.000000 | 199150.000000 | 6.700000 | 18000.000000 | 72000.000000 | 72000.000000 | 1.000000 | 2021.000000 | 311.000000 | 37.000000 | 597852.00000 |
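The summary reveals implausible extremes (age up to 311, annualized quotas up to 72000). One optional way to cap them, a sketch with hypothetical bounds that is not applied in this pipeline:
# Hypothetical caps; merged_df_f is left unchanged in this notebook
age_capped = merged_df_f["age"].clip(lower=18, upper=100)
quota_capped = merged_df_f["msf_annualizedquota__c"].clip(
    upper=merged_df_f["msf_annualizedquota__c"].quantile(0.99))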
# Let's look at the distribution of the numeric variables:
for variable in merged_df_f.select_dtypes(include='number'):
    fig = merged_df_f[variable].hist(bins=50)
    fig.set_title(f"Distribution of variable {variable}")
    fig.set_xlabel(f"{variable}")
    fig.set_ylabel('Count')
    plt.show()
merged_df_f["age"] = merged_df_f["age"].replace({np.nan:merged_df_f["age"].median()})
merged_df_f["msf_LTVCont__c"] = merged_df_f["msf_LTVCont__c"].replace({np.nan:merged_df_f["msf_LTVCont__c"].median()})
merged_df_f["msf_AverageDonorValue__c"] = merged_df_f["msf_AverageDonorValue__c"].replace({np.nan:merged_df_f["msf_AverageDonorValue__c"].median()})
# Check whether any numeric field still has null values:
# Replace blank strings ('') with NaN
merged_df_f.replace('', np.nan, inplace=True)
# Compute the percentage of null values (including blanks) in the numeric columns
porcentaje_nulos = merged_df_f.select_dtypes(include='number').isnull().mean() * 100
# Show the percentage of null values (including blanks)
print(porcentaje_nulos)
msf_annualizedquota__c             0.0
msf_LTVCont__c                     0.0
msf_ScoringRFVRecurringDonor__c    0.0
msf_AverageDonorValue__c           0.0
msf_changeannualizedquota__c       0.0
msf_newannualizedquota__c          0.0
flag_aumento                       0.0
year_activo                        0.0
age                                0.0
msf_seniority__c                   0.0
index                              0.0
dtype: float64
# Look at the outlier values:
numeric_cols = list(merged_df_f.columns)
for col in numeric_cols:
    if merged_df_f[col].dtype in [int, float]:
        fig = px.histogram(merged_df_f[numeric_cols], x=col, title=f"Histogram for column {col}")
        # Check for atypical (outlier) values
        mean = np.mean(merged_df_f[col])
        std = np.std(merged_df_f[col])
        print(f"Mean for col {col} is {mean}")
        print(f"Standard deviation for col {col} is {std}")
        atipicos_leves_sup = mean + 1.5*std
        atipicos_leves_inf = mean - 1.5*std
        fig.add_vrect(x0=atipicos_leves_inf, x1=atipicos_leves_sup,
                      annotation_text="typical range", annotation_position="top left",
                      fillcolor="green", opacity=0.25, line_width=0)
        fig.show()
        print("\n")
Mean for col msf_annualizedquota__c is 166.89830468359273 Standard deviation for col msf_annualizedquota__c is 199.09706683768385
Mean for col msf_LTVCont__c is 901.1601888591342 Standard deviation for col msf_LTVCont__c is 1305.3848723785452
Mean for col msf_ScoringRFVRecurringDonor__c is 3.1424530779151527 Standard deviation for col msf_ScoringRFVRecurringDonor__c is 1.6027595352130293
Mean for col msf_AverageDonorValue__c is 38.013411424062774 Standard deviation for col msf_AverageDonorValue__c is 88.93805383830966
Mean for col msf_changeannualizedquota__c is 104.07423477008379 Standard deviation for col msf_changeannualizedquota__c is 224.51542827376957
Mean for col msf_newannualizedquota__c is 151.42951016386814 Standard deviation for col msf_newannualizedquota__c is 216.04296862393127
Mean for col age is 55.005618437977226 Standard deviation for col age is 14.630258690073589
Mean for col msf_seniority__c is 6.957332320821339 Standard deviation for col msf_seniority__c is 4.058336251907183
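The band above uses mean ± 1.5·std; the more common Tukey rule instead uses 1.5·IQR around the quartiles. A sketch for one column, assuming the same merged_df_f:
# Tukey fences for a single column as an example
q1 = merged_df_f["msf_annualizedquota__c"].quantile(0.25)
q3 = merged_df_f["msf_annualizedquota__c"].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
n_out = ((merged_df_f["msf_annualizedquota__c"] < lower) | (merged_df_f["msf_annualizedquota__c"] > upper)).sum()
print(f"{n_out} rows fall outside the Tukey fences [{lower}, {upper}]")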
# Now handle the categorical (object) fields:
merged_df_f.describe(include='object')
| | rec_id | msf_currentleadsource1__c | msf_leadsource1__c | npe03__installment_period__c | Gender__c | con_id | msf_leadsource3__c | msf_newrecurringperiod__c | Provincia2 |
|---|---|---|---|---|---|---|---|---|---|
| count | 597853 | 597678 | 597789 | 596564 | 591637 | 597853 | 583597 | 597752 | 597853 |
| unique | 378633 | 24 | 18 | 5 | 3 | 357693 | 6 | 5 | 6 |
| top | a093Y00001SrXozQAF | Telemarketing | Persona a persona | Monthly | Female | 0033Y00002uNbs3QAC | Teléfono | Monthly | Resto |
| freq | 68 | 365606 | 270635 | 488413 | 343659 | 68 | 328333 | 493415 | 321980 |
variables_object = ['msf_currentleadsource1__c','msf_leadsource1__c','npe03__installment_period__c','Gender__c','msf_leadsource3__c','msf_newrecurringperiod__c','Provincia2']
for variable in variables_object:
merged_df_f[variable].value_counts(ascending=True,normalize=True).tail(20).plot.barh()
plt.show()
# Recode some fields based on the majority values so we have fewer categories
def agrup_currentleadsource(x):
    # Keep the five majority categories; anything else is grouped as 'Resto'
    if x in ("Telemarketing", "Persona a persona", "Web MSF",
             "Personal con tablet", "Teléfono campaña"):
        return x
    return 'Resto'
# Apply the function to two fields
merged_df_f['agrup_currentleadsource'] = merged_df_f['msf_currentleadsource1__c'].apply(agrup_currentleadsource)
merged_df_f['agrup_leadsource'] = merged_df_f['msf_leadsource1__c'].apply(agrup_currentleadsource)
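The same grouping can also be done without apply, via a vectorized where; a sketch equivalent to the function above:
# Keep the value when it is in the allowed set, otherwise 'Resto'
keep = ["Telemarketing", "Persona a persona", "Web MSF", "Personal con tablet", "Teléfono campaña"]
merged_df_f['agrup_currentleadsource'] = merged_df_f['msf_currentleadsource1__c'].where(
    merged_df_f['msf_currentleadsource1__c'].isin(keep), 'Resto')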
# Unify the values of the gender field
def agrup_gender(x):
    # 'M' (Mujer) maps to Female and 'H' (Hombre) maps to Male, the Spanish abbreviations
    mapping = {"Female": "Female", "Male": "Male", "M": "Female", "H": "Male"}
    return mapping.get(x, 'Resto')
merged_df_f['agrup_gender'] = merged_df_f['Gender__c'].apply(agrup_gender)
# Fill the remaining nulls by carrying the previous non-null value forward; anything still missing becomes 0:
merged_df_f2 = merged_df_f.fillna(method='ffill', axis=0).fillna(0)
# Drop the original columns now replaced by the grouped variables above
merged_df_f2 = merged_df_f2.drop(['msf_currentleadsource1__c','msf_leadsource1__c','Gender__c'], axis=1)
# List of columns to one-hot encode (dummy variables)
var_dummies = ['npe03__installment_period__c','msf_PressureComplaint__c','agrup_gender','msf_leadsource3__c','msf_newrecurringperiod__c','Provincia2',
'agrup_currentleadsource', 'agrup_leadsource']
merged_df_f2 = pd.get_dummies(merged_df_f2, columns = var_dummies, drop_first = True)
merged_df_f2.head()
| | rec_id | msf_annualizedquota__c | msf_LTVCont__c | msf_ScoringRFVRecurringDonor__c | msf_AverageDonorValue__c | con_id | msf_changeannualizedquota__c | msf_newannualizedquota__c | flag_aumento | year_activo | age | msf_seniority__c | index | npe03__installment_period__c_Monthly | npe03__installment_period__c_Quarterly | npe03__installment_period__c_Semestral | npe03__installment_period__c_Yearly | msf_PressureComplaint__c_True | agrup_gender_Male | agrup_gender_Resto | msf_leadsource3__c_Entidad financiera | msf_leadsource3__c_Online | msf_leadsource3__c_Otro | msf_leadsource3__c_Personal | msf_leadsource3__c_Teléfono | msf_newrecurringperiod__c_Monthly | msf_newrecurringperiod__c_Quarterly | msf_newrecurringperiod__c_Semestral | msf_newrecurringperiod__c_Yearly | Provincia2_Madrid | Provincia2_Pais Vasco | Provincia2_Resto | Provincia2_Sevilla | Provincia2_Valencia | agrup_currentleadsource_Personal con tablet | agrup_currentleadsource_Resto | agrup_currentleadsource_Telemarketing | agrup_currentleadsource_Teléfono campaña | agrup_currentleadsource_Web MSF | agrup_leadsource_Personal con tablet | agrup_leadsource_Resto | agrup_leadsource_Telemarketing | agrup_leadsource_Teléfono campaña | agrup_leadsource_Web MSF |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | a093Y00001RZ7YyQAL | 180.0 | 830.0 | 0.6 | 30.44 | 0033Y00002uo1H5QAI | 180.0 | 180.0 | 0 | 2015 | 55.0 | 9.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | a093Y00001RZ7ZuQAL | 60.0 | 300.0 | 1.8 | 30.44 | 0033Y00002uooTFQAY | 60.0 | 60.0 | 0 | 2015 | 66.0 | 8.0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | a093Y00001RZ7a2QAD | 300.0 | 1125.0 | 1.5 | 30.44 | 0033Y00002up0LKQAY | 300.0 | 300.0 | 0 | 2015 | 35.0 | 8.0 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | a093Y00001RZ7aJQAT | 360.0 | 1200.0 | 1.5 | 30.44 | 0033Y00002up3uJQAQ | 360.0 | 360.0 | 0 | 2015 | 64.0 | 8.0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | a093Y00001RZ7aOQAT | 120.0 | 380.0 | 1.5 | 30.44 | 0033Y00002upNLQQA2 | 120.0 | 120.0 | 0 | 2015 | 32.0 | 8.0 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
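One caveat with get_dummies: when scoring new data later, the dummy columns must match this training layout. A sketch with a hypothetical new_df batch:
# Hypothetical: encode a new batch and align it to the training columns, filling absent dummies with 0
# new_encoded = pd.get_dummies(new_df, columns=var_dummies, drop_first=True)
# new_encoded = new_encoded.reindex(columns=merged_df_f2.columns, fill_value=0)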
# Save the resulting table
merged_df_f2.to_csv('Data/df_inicial.csv')
# Drop the id fields from the contacts table and the recurring donations table:
merged_df_f2 = merged_df_f2.drop(["con_id", "rec_id"], axis = 1)
# Compute the pairwise correlation matrix over all columns
correlation = merged_df_f2.corr(method='pearson')
# View the correlations with the target, sorted by absolute value in descending order
correlation.flag_aumento.abs().sort_values(ascending=False)[1:]
msf_leadsource3__c_Teléfono                    0.266303
msf_leadsource3__c_Personal                    0.221631
agrup_currentleadsource_Telemarketing          0.187320
msf_leadsource3__c_Online                      0.085210
msf_changeannualizedquota__c                   0.084690
agrup_currentleadsource_Web MSF                0.054751
agrup_currentleadsource_Personal con tablet    0.054029
msf_ScoringRFVRecurringDonor__c                0.052900
npe03__installment_period__c_Monthly           0.052376
msf_newrecurringperiod__c_Monthly              0.051817
msf_newrecurringperiod__c_Yearly               0.048153
npe03__installment_period__c_Yearly            0.047969
age                                            0.043269
msf_annualizedquota__c                         0.041333
year_activo                                    0.040818
agrup_currentleadsource_Resto                  0.039388
msf_newannualizedquota__c                      0.038299
index                                          0.035616
msf_LTVCont__c                                 0.024995
agrup_leadsource_Telemarketing                 0.022559
msf_seniority__c                               0.021760
agrup_currentleadsource_Teléfono campaña       0.020592
agrup_leadsource_Web MSF                       0.017577
npe03__installment_period__c_Quarterly         0.015872
msf_newrecurringperiod__c_Quarterly            0.015124
npe03__installment_period__c_Semestral         0.011372
msf_newrecurringperiod__c_Semestral            0.010166
agrup_leadsource_Resto                         0.009675
Gender__c_Male                                 0.008489
Gender__c_Other                                0.007456
agrup_leadsource_Personal con tablet           0.006569
msf_PressureComplaint__c_True                  0.005828
msf_leadsource3__c_Otro                        0.005809
agrup_leadsource_Teléfono campaña              0.003745
Provincia2_Sevilla                             0.003694
msf_leadsource3__c_Entidad financiera          0.003167
Provincia2_Valencia                            0.002847
Provincia2_Resto                               0.002601
Provincia2_Pais Vasco                          0.002360
msf_AverageDonorValue__c                       0.002353
Provincia2_Madrid                              0.000243
Name: flag_aumento, dtype: float64
# Use Seaborn to plot a bar chart showing the distribution of the target variable
sns.countplot(x="flag_aumento", data=merged_df_f2)
<AxesSubplot: xlabel='flag_aumento', ylabel='count'>
# See the numeric breakdown of the distribution obtained
merged_df_f2["flag_aumento"].value_counts()
0    534658
1     63195
Name: flag_aumento, dtype: int64
The target variable is clearly imbalanced --> we address it.
# Split our dataset into independent variables (X) and the target variable (y)
X = merged_df_f2.drop("flag_aumento", axis = 1)
y = merged_df_f2["flag_aumento"]
# Apply oversampling with SMOTE to balance the classes
oversample = SMOTE(sampling_strategy=0.6)
X_smote, y_smote = oversample.fit_resample(X, y)
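To verify the resampling, compare the class counts before and after; sampling_strategy=0.6 targets a minority/majority ratio of 0.6:
from collections import Counter
# Class counts before and after oversampling
print("Before SMOTE:", Counter(y))
print("After SMOTE: ", Counter(y_smote))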
# Build a new DataFrame with the balanced features and the target variable
dataset_smote = pd.DataFrame(X_smote)
dataset_smote["flag_aumento"] = y_smote
# Check the distribution of the treated dataset
sns.countplot(x="flag_aumento", data=dataset_smote)
<AxesSubplot: xlabel='flag_aumento', ylabel='count'>
# Save the resulting table; this is what we will use to test the model
dataset_smote.to_csv('Data/df_balanceada.csv')
# Read the table back (if resuming from disk)
# dataset_smote = pd.read_csv('Data/df_balanceada.csv')
# Remove the target variable (and the helper index) from X:
X = dataset_smote.drop(["flag_aumento","index"], axis = 1)
y = dataset_smote['flag_aumento']
# Split the dataset into training and test sets:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.20, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)
(684361, 40) (171091, 40) (684361,) (171091,)
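For an imbalanced target, a stratified split keeps the class ratio identical in both subsets; a sketch (the notebook keeps the plain random split above):
# Same split but preserving the class proportions via stratify
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(X, y, test_size=0.20, random_state=0, stratify=y)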
# Apply standardization to the features of the training set (X_train) and the test set (X_test)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Try a Logistic Regression model
classifier = LogisticRegression(random_state = 0)
# Train the model
classifier.fit(X_train, Y_train)
LogisticRegression(random_state=0)
# Make predictions on the test set with the trained classifier
y_pred = classifier.predict(X_test)
# Evaluate the model's performance
print(classification_report(Y_test, y_pred))
precision recall f1-score support
0 0.90 0.90 0.90 107137
1 0.83 0.84 0.83 63954
accuracy 0.88 171091
macro avg 0.87 0.87 0.87 171091
weighted avg 0.88 0.88 0.88 171091
# Show the confusion matrix comparing the predictions against the true labels
# (rows/columns follow the sorted class order: 0 = no quota increase, 1 = quota increase)
pd.DataFrame(
    confusion_matrix(Y_test, y_pred),
    columns=['Predicted: no quota increase', 'Predicted: quota increase'],
    index =['Actual: no quota increase', 'Actual: quota increase']
)
| | Predicted: no quota increase | Predicted: quota increase |
|---|---|---|
| Actual: no quota increase | 96244 | 10893 |
| Actual: quota increase | 10467 | 53487 |
# Compute and display the model's accuracy
accuracy_score(Y_test, y_pred)
0.8751541577289279
# Compute and display the area under the model's ROC curve
roc_auc = roc_auc_score(Y_test, y_pred)
roc_auc
0.8673309664898703
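Note that the score above is computed from the hard 0/1 predictions; ROC AUC is usually computed from the predicted probabilities instead. A sketch:
# Probability of the positive class for each test row
y_proba = classifier.predict_proba(X_test)[:, 1]
print(f"ROC AUC from probabilities: {roc_auc_score(Y_test, y_proba):.4f}")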
# Plot the ROC curve (matplotlib was already imported above)
fpr, tpr, threshold = roc_curve(Y_test, y_pred)
plt.title("Area under the curve")
plt.plot(fpr, tpr, label = "AUC = %0.2f"% roc_auc)
plt.legend(loc = "lower right")
plt.plot([0, 1], [0, 1], "r--")  # chance diagonal, made explicit
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel("True positive rate")
plt.xlabel("False positive rate")
plt.show()
e) Improve the results by optimizing the parameters.
We use GridSearch on the Logistic Regression, since it gave us the best accuracy.
# Select the predictor variables and the target
# (dataset_nearmiss is assumed to come from a NearMiss undersampling step not shown in this section)
X = dataset_nearmiss.drop("flag_aumento", axis = 1)
y = dataset_nearmiss['flag_aumento']
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2)
# Standardize:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_val = sc_X.transform(X_val)
classifier = LogisticRegression(random_state = 0)
# Define the parameter range; GridSearch will try combinations of the hyperparameters C and penalty
# Note: LogisticRegression's default 'lbfgs' solver only supports 'l2'; 'l1' needs solver='liblinear' or 'saga'
param_grid = {'C': np.linspace(0.1, 2, 100), 'penalty' : ['l2', 'l1']}
# Define the model selector
# GridSearch will fit the logistic regression while varying the hyperparameters defined in param_grid
grid = GridSearchCV(classifier, param_grid)
# Fit the grid to extract the best model
grid.fit(X_train, y_train)
GridSearchCV(estimator=LogisticRegression(random_state=0),
param_grid={'C': array([0.1 , 0.11919192, 0.13838384, 0.15757576, 0.17676768,
0.1959596 , 0.21515152, 0.23434343, 0.25353535, 0.27272727,
0.29191919, 0.31111111, 0.33030303, 0.34949495, 0.36868687,
0.38787879, 0.40707071, 0.42626263, 0.44545455, 0.46464646,
0.48383838, 0.5030303 , 0.52222222, 0.54141414, 0.56060606,
0.5...
1.34747475, 1.36666667, 1.38585859, 1.40505051, 1.42424242,
1.44343434, 1.46262626, 1.48181818, 1.5010101 , 1.52020202,
1.53939394, 1.55858586, 1.57777778, 1.5969697 , 1.61616162,
1.63535354, 1.65454545, 1.67373737, 1.69292929, 1.71212121,
1.73131313, 1.75050505, 1.76969697, 1.78888889, 1.80808081,
1.82727273, 1.84646465, 1.86565657, 1.88484848, 1.9040404 ,
1.92323232, 1.94242424, 1.96161616, 1.98080808, 2. ]),
'penalty': ['l2', 'l1']})
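If the 'l1' candidates fail under the default 'lbfgs' solver, an explicit solver plus cross-validation and scoring settings can be supplied; a sketch, not the configuration used above:
# Hypothetical alternative: 'liblinear' supports both l1 and l2 penalties
grid_alt = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=0),
    param_grid={'C': np.linspace(0.1, 2, 20), 'penalty': ['l1', 'l2']},
    cv=5, scoring='roc_auc', n_jobs=-1)
# grid_alt.fit(X_train, y_train)  # uncomment to run; mirrors grid.fit above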
grid.best_params_ # View the best parameters
{'C': 1.6545454545454545, 'penalty': 'l2'}
grid_prediction = grid.best_estimator_.predict(X_val)
print(classification_report(y_val, grid_prediction))
precision recall f1-score support
0 0.70 0.74 0.72 12710
1 0.72 0.68 0.70 12568
accuracy 0.71 25278
macro avg 0.71 0.71 0.71 25278
weighted avg 0.71 0.71 0.71 25278
# Show the confusion matrix
confusion_matrix(y_val, grid_prediction)
array([[9396, 3314],
[4048, 8520]], dtype=int64)
# Accuracy on the validation set (these results come from the undersampled dataset, so they are not directly comparable with the earlier ones)
accuracy_score(y_val, grid_prediction)
0.7087586043199621
roc_auc_score(y_val, grid_prediction)
0.708586291361774
# Compute the inputs for the ROC curve plot
fpr, tpr, threshold = roc_curve(y_val, grid_prediction)
roc_auc = roc_auc_score(y_val, grid_prediction)
plt.title("Area under the curve")
plt.plot(fpr, tpr, label = "AUC = %0.2f"% roc_auc)
plt.legend(loc = "lower right")
plt.plot([0, 1], [0, 1], "r--")  # chance diagonal, made explicit
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel("True positive rate")
plt.xlabel("False positive rate")
plt.show()